started: Alexey Larionov, Feb2017
last updated: Alexey Larionov, 05May2017

Summary

Select 93 genes:

83 genes:
- skat_M burden_svt_p <= 0.05
- aggregated exac-NFE AF <= 0.05
- crude trend call = “risk”

10 genes w/o exac data:
- skat_M burden_svt_p <= 0.05

Explore selected genes
Save selected genes to text file

Plot trend for each selected gene

start_section

# Start time
Sys.time()
## [1] "2017-05-05 22:10:52 BST"
# Folders
library(knitr)
base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"
opts_knit$set(root.dir = base_folder)
#setwd(base_folder)

load_data

load(paste(base_folder, "results", "r09b_SKAT_M_wecare_only.RData", sep="/"))
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"

load(paste(base_folder, "results", "r10b_invert_aggregate_kgen_exac.RData", sep="/"))
#base_folder="/analysis/mtgroup_share/users/alexey/wecare_only_stat_05.17"

check_data

ls()
##  [1] "base_folder"             "exac.df"                
##  [3] "genes_aggr_data.mx"      "genes_aggr_exac.df"     
##  [5] "genes_aggr_info.df"      "genes_aggr_kgen.df"     
##  [7] "genes_aggr_skat_M.df"    "genotypes_inv_imp_wt.mx"
##  [9] "genotypes.mx"            "kgen.df"                
## [11] "phenotypes.df"           "variants.df"
dim(genotypes.mx)
## [1] 18551   478
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026      0     NA      0      0      0
## Var000000029      0      0      0      0      0
## Var000000031      0      0      0      0      0
## Var000000053      0      0      0      0     NA
## Var000000064      0      0      0      0     NA
dim(genotypes_inv_imp_wt.mx)
## [1] 18551   478
class(genotypes_inv_imp_wt.mx)
## [1] "matrix"
genotypes_inv_imp_wt.mx[1:5,1:5]
##              P1_A01     P1_A02 P1_A03 P1_A04     P1_A05
## Var000000026      0 0.05482456      0      0 0.00000000
## Var000000029      0 0.00000000      0      0 0.00000000
## Var000000031      0 0.00000000      0      0 0.00000000
## Var000000053      0 0.00000000      0      0 0.05707763
## Var000000064      0 0.00000000      0      0 0.05518764
dim(genes_aggr_data.mx)
## [1] 9117  478
class(genes_aggr_data.mx)
## [1] "matrix"
genes_aggr_data.mx[1:5,1:5]
##        P1_A01     P1_A02     P1_A03   P1_A04    P1_A05
## NOC2L       0 0.05482456 0.00000000 0.000000 0.0000000
## KLHL17      0 0.00000000 0.00000000 0.000000 0.1122653
## AGRN        0 0.65431621 0.00000000 8.602334 0.0000000
## TTLL10      0 0.00000000 0.00000000 0.000000 0.0000000
## PUSL1       0 0.05821459 0.05821459 0.000000 0.0000000
dim(genes_aggr_skat_M.df)
## [1] 9109   12
str(genes_aggr_skat_M.df)
## 'data.frame':    9109 obs. of  12 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : int  3 2 3 1 1 1 1 1 2 1 ...
##  $ svt_p             : num  NA NA NA 0.346 0.711 ...
##  $ svt_is_accurate   : logi  NA NA NA TRUE TRUE TRUE ...
##  $ svt_map           : num  NA NA NA 0.0912 0.2106 ...
##  $ burden_p          : num  0.533 0.343 0.431 NA NA ...
##  $ burden_is_accurate: logi  TRUE TRUE TRUE NA NA NA ...
##  $ burden_map        : num  0.0135 0.0871 -1 NA NA ...
##  $ skat_p            : num  0.465 0.831 0.344 NA NA ...
##  $ skat_is_accurate  : logi  TRUE TRUE TRUE NA NA NA ...
##  $ skat_map          : num  0.0118 0.0871 -1 NA NA ...
##  $ aggregated_MAC    : num  4 2 36 2 1 1 1 41 3 1 ...
genes_aggr_skat_M.df[1:5,1:5]
##          gene num_var     svt_p svt_is_accurate    svt_map
## NOC2L   NOC2L       3        NA              NA         NA
## KLHL17 KLHL17       2        NA              NA         NA
## AGRN     AGRN       3        NA              NA         NA
## TTLL10 TTLL10       1 0.3459301            TRUE 0.09120194
## PUSL1   PUSL1       1 0.7106340            TRUE 0.21063399
dim(genes_aggr_info.df)
## [1] 9117   27
str(genes_aggr_info.df)
## 'data.frame':    9117 obs. of  27 variables:
##  $ gene             : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var          : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ aggr_ac          : num  4 2 36 2 1 1 1 41 3 1 ...
##  $ aggr_an          : num  2808 1782 2792 956 810 ...
##  $ aggr_af          : num  0.00142 0.00112 0.01289 0.00209 0.00123 ...
##  $ aggr_ac_cbc      : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ aggr_an_cbc      : num  1358 870 1360 466 400 ...
##  $ aggr_af_cbc      : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ aggr_ac_ubc      : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ aggr_an_ubc      : num  1450 912 1432 490 410 ...
##  $ aggr_af_ubc      : num  0.00069 0 0.01466 0 0.00244 ...
##  $ aggr_ac_cbc_fam  : num  1 0 5 2 0 0 1 9 0 0 ...
##  $ aggr_an_cbc_fam  : num  482 306 478 166 144 162 164 164 318 150 ...
##  $ aggr_af_cbc_fam  : num  0.00207 0 0.01046 0.01205 0 ...
##  $ aggr_ac_cbc_nofam: num  2 2 10 0 0 1 0 11 2 0 ...
##  $ aggr_an_cbc_nofam: num  876 564 882 300 256 298 296 300 566 290 ...
##  $ aggr_af_cbc_nofam: num  0.00228 0.00355 0.01134 0 0 ...
##  $ aggr_ac_ubc_fam  : num  0 0 4 0 1 0 0 8 0 1 ...
##  $ aggr_an_ubc_fam  : num  418 262 414 142 120 140 142 142 278 132 ...
##  $ aggr_af_ubc_fam  : num  0 0 0.00966 0 0.00833 ...
##  $ aggr_ac_ubc_nofam: num  1 0 17 0 0 0 0 13 1 0 ...
##  $ aggr_an_ubc_nofam: num  1032 650 1018 348 290 ...
##  $ aggr_af_ubc_nofam: num  0.000969 0 0.016699 0 0 ...
##  $ cbc_ubc_call     : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
##  $ cbc_ubc_fisher_p : num  0.359 0.238 0.408 0.237 1 ...
genes_aggr_info.df[1:5,1:5]
##          gene num_var inverted multiallelic aggr_ac
## NOC2L   NOC2L       3    FALSE        FALSE       4
## KLHL17 KLHL17       2    FALSE        FALSE       2
## AGRN     AGRN       3    FALSE        FALSE      36
## TTLL10 TTLL10       1    FALSE        FALSE       2
## PUSL1   PUSL1       1    FALSE        FALSE       1
dim(genes_aggr_kgen.df)
## [1] 9117   16
str(genes_aggr_kgen.df)
## 'data.frame':    9117 obs. of  16 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_kgen           : num  2 NA 92 NA 6 NA 4 371 9 NA ...
##  $ an_kgen           : num  10016 NA 5008 NA 5008 ...
##  $ af_kgen           : num  0.0002 NA 0.0184 NA 0.0012 ...
##  $ ac_ubc            : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ an_ubc            : num  1450 912 1432 490 410 ...
##  $ af_ubc            : num  0.00069 0 0.01466 0 0.00244 ...
##  $ ac_cbc            : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ an_cbc            : num  1358 870 1360 466 400 ...
##  $ af_cbc            : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ pearson_r         : num  0.959 NA -1 NA -0.491 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 NA 1 NA 1 NA 2 1 2 NA ...
##  $ crude_trend_test_p: num  0.00168 NA 0.04823 NA 0.75384 ...
genes_aggr_kgen.df[1:5,1:5]
##          gene num_var inverted multiallelic ac_kgen
## NOC2L   NOC2L       3    FALSE        FALSE       2
## KLHL17 KLHL17       2    FALSE        FALSE      NA
## AGRN     AGRN       3    FALSE        FALSE      92
## TTLL10 TTLL10       1    FALSE        FALSE      NA
## PUSL1   PUSL1       1    FALSE        FALSE       6
dim(genes_aggr_exac.df)
## [1] 9117   16
str(genes_aggr_exac.df)
## 'data.frame':    9117 obs. of  16 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_exac_NFE       : num  14 2 2193 1 32 ...
##  $ an_exac_NFE       : num  108598 108618 74336 4760 54314 ...
##  $ af_exac_NFE       : num  1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
##  $ ac_ubc            : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ an_ubc            : num  1450 912 1432 490 410 ...
##  $ af_ubc            : num  0.00069 0 0.01466 0 0.00244 ...
##  $ ac_cbc            : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ an_cbc            : num  1358 870 1360 466 400 ...
##  $ af_cbc            : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ pearson_r         : num  0.966 0.863 -0.944 0.844 -0.231 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
##  $ crude_trend_test_p: num  1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_exac.df[1:5,1:5]
##          gene num_var inverted multiallelic ac_exac_NFE
## NOC2L   NOC2L       3    FALSE        FALSE          14
## KLHL17 KLHL17       2    FALSE        FALSE           2
## AGRN     AGRN       3    FALSE        FALSE        2193
## TTLL10 TTLL10       1    FALSE        FALSE           1
## PUSL1   PUSL1       1    FALSE        FALSE          32
dim(kgen.df)
## [1] 18551     9
colnames(kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000026 Var000000026      NA      NA          NA          NA
## Var000000029 Var000000029       1    5008 0.000199681           0
## Var000000031 Var000000031       1    5008 0.000199681           0
## Var000000053 Var000000053      NA      NA          NA          NA
## Var000000064 Var000000064      NA      NA          NA          NA
dim(exac.df)
## [1] 18551    48
colnames(exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000026 Var000000026               NA               NA
## Var000000029 Var000000029        1.506e-04               16
## Var000000031 Var000000031        2.354e-04               25
## Var000000053 Var000000053        1.883e-05                2
## Var000000064 Var000000064        9.416e-06                1
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000026               NA                      NA
## Var000000029           106210                       8
## Var000000031           106210                       4
## Var000000053           106210                       0
## Var000000064           106206                       0
dim(variants.df)
## [1] 18551    67
str(variants.df)
## 'data.frame':    18551 obs. of  67 variables:
##  $ SplitVarID        : Factor w/ 343824 levels "Var000000001",..: 26 29 31 53 64 167 227 237 300 400 ...
##  $ SYMBOL            : Factor w/ 19862 levels "A1BG","A1CF",..: 11149 11149 11149 8665 8665 540 540 540 18233 13545 ...
##  $ TYPE              : Factor w/ 2 levels "INDEL","SNP": 2 2 2 2 2 2 2 2 2 2 ...
##  $ CHROM             : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ POS               : int  881871 883883 883946 897287 897792 979748 987173 989899 1132513 1246373 ...
##  $ REF               : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1575 539 539 539 1 1575 539 539 1044 ...
##  $ ALT               : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 323 989 989 989 989 664 989 989 1 ...
##  $ ac_raw            : int  1 2 1 1 1 53 1 1 2 1 ...
##  $ af_raw            : num  0.000711 0.001555 0.000774 0.000711 0.000722 ...
##  $ an_raw            : int  1406 1286 1292 1406 1386 1416 1420 1414 1320 1400 ...
##  $ Consequence       : Factor w/ 78 levels "3_prime_UTR_variant",..: 66 28 28 28 66 28 28 28 28 28 ...
##  $ SIFT_call         : Factor w/ 4 levels "deleterious",..: NA 1 1 1 NA 1 1 1 1 1 ...
##  $ SIFT_score        : num  NA 0 0.01 0.04 NA 0.01 0 0.02 0.02 0.05 ...
##  $ PolyPhen_call     : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 3 NA 3 3 3 3 3 ...
##  $ PolyPhen_score    : num  NA 0.936 0.946 0.997 NA 0.932 0.999 0.991 0.999 0.988 ...
##  $ CLIN_SIG          : Factor w/ 120 levels "association",..: NA NA NA NA NA 2 NA NA NA NA ...
##  $ cDNA_position     : Factor w/ 16585 levels "1","?-1","10",..: 4302 3741 3476 13013 15367 5539 11506 11969 4210 15212 ...
##  $ CDS_position      : Factor w/ 15333 levels "1","?-1","10",..: 3744 3218 2977 10599 13061 4871 10484 10919 3279 13242 ...
##  $ Codons            : Factor w/ 3014 levels "-/A","-/AA","AAA/-",..: 1025 1858 1306 1172 1300 1864 2503 374 328 1877 ...
##  $ Protein_position  : Factor w/ 7360 levels "1","?-1","10",..: 6008 5707 5569 1723 2739 6606 1672 1845 5745 2811 ...
##  $ Amino_acids       : Factor w/ 1851 levels "-/*","*","A",..: 1171 334 1303 1134 1283 381 1809 1594 1578 225 ...
##  $ Existing_variation: Factor w/ 309385 levels "","FANCA:c.2534T>C",..: NA 54731 225315 291348 274420 10329 268628 74468 292466 49490 ...
##  $ Multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_all            : int  1 2 1 1 1 34 1 1 2 1 ...
##  $ an_all            : num  912 942 954 876 906 894 954 944 956 810 ...
##  $ af_all            : num  0.0011 0.00212 0.00105 0.00114 0.0011 ...
##  $ ac_ubc            : int  0 0 1 0 0 20 0 1 0 1 ...
##  $ an_ubc            : num  472 488 490 450 462 460 488 484 490 410 ...
##  $ af_ubc            : num  0 0 0.00204 0 0 ...
##  $ ac_cbc            : int  1 2 0 1 1 14 1 0 2 0 ...
##  $ an_cbc            : num  440 454 464 426 444 434 466 460 466 400 ...
##  $ af_cbc            : num  0.00227 0.00441 0 0.00235 0.00225 ...
##  $ ac_ubc_fam        : int  0 0 0 0 0 4 0 0 0 1 ...
##  $ an_ubc_fam        : num  134 142 142 130 132 132 142 140 142 120 ...
##  $ af_ubc_fam        : num  0 0 0 0 0 ...
##  $ ac_ubc_nofam      : int  0 0 1 0 0 16 0 1 0 0 ...
##  $ an_ubc_nofam      : num  338 346 348 320 330 328 346 344 348 290 ...
##  $ af_ubc_nofam      : num  0 0 0.00287 0 0 ...
##  $ ac_cbc_fam        : int  0 1 0 0 0 5 0 0 2 0 ...
##  $ an_cbc_fam        : num  156 162 164 152 154 148 166 164 166 144 ...
##  $ af_cbc_fam        : num  0 0.00617 0 0 0 ...
##  $ ac_cbc_nofam      : int  1 1 0 1 1 9 1 0 0 0 ...
##  $ an_cbc_nofam      : num  284 292 300 274 290 286 300 296 300 256 ...
##  $ af_cbc_nofam      : num  0.00352 0.00342 0 0.00365 0.00345 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_inv            : num  1 2 1 1 1 34 1 1 2 1 ...
##  $ an_inv            : num  912 942 954 876 906 894 954 944 956 810 ...
##  $ af_inv            : num  0.0011 0.00212 0.00105 0.00114 0.0011 ...
##  $ ac_cbc_inv        : num  1 2 0 1 1 14 1 0 2 0 ...
##  $ an_cbc_inv        : num  440 454 464 426 444 434 466 460 466 400 ...
##  $ af_cbc_inv        : num  0.00227 0.00441 0 0.00235 0.00225 ...
##  $ ac_ubc_inv        : num  0 0 1 0 0 20 0 1 0 1 ...
##  $ an_ubc_inv        : num  472 488 490 450 462 460 488 484 490 410 ...
##  $ af_ubc_inv        : num  0 0 0.00204 0 0 ...
##  $ ac_cbc_fam_inv    : num  0 1 0 0 0 5 0 0 2 0 ...
##  $ an_cbc_fam_inv    : num  156 162 164 152 154 148 166 164 166 144 ...
##  $ af_cbc_fam_inv    : num  0 0.00617 0 0 0 ...
##  $ ac_cbc_nofam_inv  : num  1 1 0 1 1 9 1 0 0 0 ...
##  $ an_cbc_nofam_inv  : num  284 292 300 274 290 286 300 296 300 256 ...
##  $ af_cbc_nofam_inv  : num  0.00352 0.00342 0 0.00365 0.00345 ...
##  $ ac_ubc_fam_inv    : num  0 0 0 0 0 4 0 0 0 1 ...
##  $ an_ubc_fam_inv    : num  134 142 142 130 132 132 142 140 142 120 ...
##  $ af_ubc_fam_inv    : num  0 0 0 0 0 ...
##  $ ac_ubc_nofam_inv  : num  0 0 1 0 0 16 0 1 0 0 ...
##  $ an_ubc_nofam_inv  : num  338 346 348 320 330 328 346 344 348 290 ...
##  $ af_ubc_nofam_inv  : num  0 0 0.00287 0 0 ...
##  $ weight            : num  25 25 23.8 25 25 ...
variants.df[1:5,1:5]
##                SplitVarID SYMBOL TYPE CHROM    POS
## Var000000026 Var000000026  NOC2L  SNP     1 881871
## Var000000029 Var000000029  NOC2L  SNP     1 883883
## Var000000031 Var000000031  NOC2L  SNP     1 883946
## Var000000053 Var000000053 KLHL17  SNP     1 897287
## Var000000064 Var000000064 KLHL17  SNP     1 897792
dim(phenotypes.df)
## [1] 478  36
str(phenotypes.df)
## 'data.frame':    478 obs. of  36 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ registry      : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ eig4_gwas     : num  -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
##  $ eig5_gwas     : num  -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
##  $ eig1          : num  -0.1566 -0.13939 -0.00225 -0.00914 0.01841 ...
##  $ eig2          : num  0.08741 0.06847 -0.00239 0.0008 -0.02139 ...
##  $ eig3          : num  -0.0185 -0.0117 -0.0185 -0.0713 -0.0101 ...
##  $ eig4          : num  -0.03333 0.03332 0.01482 0.00719 -0.01141 ...
##  $ eig5          : num  0.02116 0.09035 0.00663 0.01628 -0.02406 ...
phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
# Check consistency of rownames and colnames
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(colnames(genes_aggr_data.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_data.mx))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_kgen.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_exac.df))
## [1] 0
# sum(rownames(genes_aggr_info.df) != rownames(Genes_aggr_skat_M.df))
# NB: - see comment before the next chunk

sum(rownames(genotypes.mx) != rownames(genotypes_inv_imp_wt.mx))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0

sync_genes_aggr_files

Keep only genes present in SKAT-M

Genes_aggr_skat_M.df has different number of rows(genes) than other genes_aggr files !!! This is OK: 8 genes failed SKAT because of low call rate ( = high NA rate).
Our filtering used 80% call rate, while default SKAT used 85% NA rate filter.
In future I may change our filter to 85%, for consistency.

# Get SKAT-M genes
skat_m_genes <- rownames(genes_aggr_skat_M.df)

# Sync
genes_aggr_info.df <- genes_aggr_info.df[skat_m_genes,]
genes_aggr_data.mx <- genes_aggr_data.mx[skat_m_genes,]
genes_aggr_kgen.df <- genes_aggr_kgen.df[skat_m_genes,]
genes_aggr_exac.df <- genes_aggr_exac.df[skat_m_genes,]

# Clean-up
rm(skat_m_genes)

generate_aggregated_SKAT_M_exac_table

# Check burden and svt p values
summary(genes_aggr_skat_M.df$svt_is_accurate)
##    Mode    TRUE    NA's 
## logical    4688    4421
summary(genes_aggr_skat_M.df$burden_is_accurate)
##    Mode    TRUE    NA's 
## logical    4421    4688
# Merge burden and svt p values
svt_p <- genes_aggr_skat_M.df$svt_p
burden_p <- genes_aggr_skat_M.df$burden_p
burden_svt_p <- ifelse(is.na(svt_p),burden_p,svt_p)
summary(burden_svt_p) # no NA
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004635 0.2270000 0.5397000 0.4987000 0.7361000 1.0000000
# Make aggregated SKAT_M_exac table
genes_aggr_skat_M_exac.df <- cbind(
  genes_aggr_skat_M.df[,c("gene", "num_var", "aggregated_MAC")],
  burden_svt_p,
  genes_aggr_exac.df[,c("inverted", "multiallelic", 
                        "ac_exac_NFE", "an_exac_NFE", "af_exac_NFE", 
                        "ac_ubc", "an_ubc", "af_ubc", 
                        "ac_cbc", "an_cbc", "af_cbc", 
                        "pearson_r", "trend_call", "crude_trend_test_p")])

# Clean-up
rm(svt_p, burden_p, burden_svt_p)

select_candidate_genes

# Explore genes_aggr_skat_M.df
nrow(genes_aggr_skat_M_exac.df)
## [1] 9109
summary(genes_aggr_skat_M_exac.df$trend_call)
## protective       risk  uncertain       NA's 
##       2992       3980        103       2034
hist(genes_aggr_skat_M_exac.df$burden_svt_p, 
     breaks=20, ylim=c(0,2000), labels=TRUE, 
     xlab="p-values", ylab="genes count", 
     main="Histogram of SKAT-M burden p-values")

# Note that expected p-values are not uniform!
# (see Lee et al Biostatistics 2016)

hist(genes_aggr_skat_M_exac.df$af_exac_NFE, 
     breaks=20, ylim=c(0,6000), labels=TRUE, 
     xlab="aggregated exac AF", ylab="genes count", 
     main="Histogram of aggregated exac AFs")

# Genes with exac data:
# "Rare" (exac <=0.05), "risk" genes with burden_p <= 0.05
selected_genes_with_exac.df <- genes_aggr_skat_M_exac.df[!is.na(genes_aggr_skat_M_exac.df$af_exac_NFE),]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$burden_svt_p <= 0.05,]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$af_exac_NFE <= 0.05,]
selected_genes_with_exac.df <- selected_genes_with_exac.df[selected_genes_with_exac.df$trend_call=="risk",]
selected_genes_with_exac.df <- selected_genes_with_exac.df[order(selected_genes_with_exac.df$burden_svt_p),]
dim(selected_genes_with_exac.df)
## [1] 83 18
# Genes without exac data: burden_p <= 0.05
selected_genes_no_exac.df <- genes_aggr_skat_M_exac.df[is.na(genes_aggr_skat_M_exac.df$trend_call),]
selected_genes_no_exac.df <- selected_genes_no_exac.df[selected_genes_no_exac.df$burden_svt_p <= 0.05,]
selected_genes_no_exac.df <- selected_genes_no_exac.df[order(selected_genes_no_exac.df$burden_svt_p),]
dim(selected_genes_no_exac.df)
## [1] 10 18
# Merge selected genes with and w/o exac data
selected_genes.df <- rbind(selected_genes_with_exac.df, selected_genes_no_exac.df)
selected_genes.df <- selected_genes.df[order(selected_genes.df$burden_svt_p),]
dim(selected_genes.df)
## [1] 93 18
# Print selected genes
selected_genes.df[,c("num_var","burden_svt_p", "af_exac_NFE", "af_ubc", "af_cbc", "crude_trend_test_p")]
##           num_var burden_svt_p  af_exac_NFE       af_ubc       af_cbc
## FAM171B         4  0.001368650 2.957000e-04 0.0096982759 0.0005630631
## EPHB2           2  0.004438192 3.075506e-03 0.0000000000 0.0068027211
## FOXM1           6  0.005854754 3.119452e-03 0.0046001415 0.0092387288
## TMEM134         1  0.007589152 6.606951e-03 0.0000000000 0.0115740741
## IQCA1           2  0.008472387 1.185144e-03 0.0000000000 0.0054704595
## GALNT12         5  0.010213508 2.526592e-03 0.0008176615 0.0043066322
## TIMELESS        7  0.010587495 2.624044e-04 0.0700541191 0.0643627140
## DBI             3  0.012421156 1.374419e-03 0.0000000000 0.0036179450
## TMEM139         1  0.012506083           NA 0.0103305785 0.0000000000
## CCDC27          4  0.013233760 1.567629e-04 0.0000000000 0.0026910657
## MYO1D           3  0.013250152 7.745648e-04 0.0000000000 0.0029154519
## ERCC6           6  0.013369314 7.560670e-04 0.0003465003 0.0025399129
## TROAP           4  0.013405362 7.079363e-04 0.0000000000 0.0032715376
## ECT2L          11  0.013495734 6.792698e-04 0.0005668934 0.0021964856
## ECM2            3  0.013579767 1.523398e-03 0.0000000000 0.0036764706
## DUOX1           5  0.013916339 1.178985e-03 0.0000000000 0.0021872266
## OR1S2           1  0.014844676           NA 0.0000000000 0.0066079295
## SH3RF2          1  0.016153907           NA 0.0000000000 0.0085836910
## DPP4            2  0.017842165 1.925356e-02 0.0132450331 0.0300925926
## MDM1            2  0.018886855 1.079157e-03 0.0000000000 0.0065789474
## ECM1            3  0.018935200 5.759923e-03 0.0013605442 0.0086330935
## REV3L           6  0.018944101 6.782671e-05 0.0000000000 0.0022075055
## PDE5A           1  0.019038427           NA 0.0153508772 0.0023255814
## LRRIQ3          4  0.019133259 6.893556e-04 0.0032467532 0.0085812357
## SLC4A3          4  0.019420337 1.746420e-04 0.0000000000 0.0022831050
## NOC3L           6  0.019943641 4.128891e-03 0.0017241379 0.0051132213
## SLC4A4          2  0.020456081 1.455926e-03 0.0000000000 0.0048543689
## RP9             1  0.021397689 1.017962e-02 0.0000000000 0.0129310345
## MEF2A           3  0.023086387 1.249017e-03 0.0000000000 0.0030165913
## SLC28A2         4  0.023450965 2.762227e-04 0.0000000000 0.0027995521
## GBGT1           1  0.023850873 5.520592e-03 0.0000000000 0.0086580087
## C8A             2  0.024934306 5.741838e-03 0.0010638298 0.0088888889
## C6orf201        1  0.025197315           NA 0.0877551020 0.0536480687
## TMCO4           5  0.025604399 1.953389e-03 0.0004237288 0.0026833631
## ITIH6           4  0.025965010 2.645236e-04 0.0000000000 0.0022547914
## ADAMTSL4        3  0.025986563 8.010253e-05 0.0127840909 0.0060331825
## RNF135          1  0.026109094           NA 0.0040983607 0.0214592275
## SCN5A           4  0.026326627 4.609230e-05 0.0000000000 0.0021786492
## RDX             4  0.026345468 1.484616e-04 0.0000000000 0.0021621622
## SERPIND1        1  0.027559056 1.784860e-03 0.0000000000 0.0086206897
## HOXD13          4  0.027629598 6.908749e-04 0.0000000000 0.0021528525
## METTL20         2  0.027958442 3.943568e-03 0.0011013216 0.0067720090
## SLC9A3R1        5  0.029060352 1.773017e-03 0.0004201681 0.0030946065
## ENTPD3          4  0.030180084 6.692208e-04 0.0000000000 0.0022296544
## LRIT3           1  0.030592502 2.787953e-02 0.0247933884 0.0504385965
## MRPL51          3  0.031142574 8.409263e-04 0.0020833333 0.0072463768
## SCRN3           4  0.032209984           NA 0.0851619645 0.0762527233
## CENPJ           2  0.032560822 1.410450e-03 0.0000000000 0.0032537961
## DEPTOR          2  0.032590560 1.289301e-04 0.0000000000 0.0033860045
## C7orf31         2  0.033264599 6.808229e-04 0.0295358650 0.0156950673
## WDR63           1  0.033343146           NA 0.0509259259 0.0857843137
## KRT12           3  0.033837477 1.632062e-03 0.0000000000 0.0021582734
## IGKC            3  0.034368191 1.238896e-03 0.0000000000 0.0021645022
## ITGA9           2  0.035439759 4.603119e-05 0.0000000000 0.0032188841
## RALGAPB         2  0.036508480 1.390142e-03 0.0000000000 0.0033112583
## C19orf59        1  0.036803160 4.697198e-03 0.0000000000 0.0064935065
## ZNF44           4  0.037896075 2.875276e-03 0.0010214505 0.0043196544
## DXO             1  0.038363244 5.668016e-03 0.0000000000 0.0107296137
## TCF24           1  0.039173952 6.772009e-03 0.0020491803 0.0129870130
## GYG1            2  0.039456483 1.089003e-03 0.0000000000 0.0053879310
## PLEKHG3         4  0.039529472 6.124381e-04 0.0000000000 0.0026968716
## STK11IP         4  0.040193202 9.105448e-03 0.0067920585 0.0137969095
## FARS2           3  0.040703069 2.039613e-03 0.0000000000 0.0037147103
## OR52K1          2  0.041547109 1.509434e-03 0.0000000000 0.0055928412
## NRG1            2  0.041964650 1.841688e-05 0.0000000000 0.0034324943
## GALT            3  0.042044605 1.150039e-03 0.0000000000 0.0021551724
## CYP2A13         2  0.042085843 5.796835e-04 0.0000000000 0.0032188841
## SVOPL           5  0.042100979 2.654720e-03 0.0480295567 0.0520833333
## GPR87           2  0.042160022 1.108525e-04 0.0000000000 0.0033333333
## NF1             3  0.042363028 5.520389e-05 0.0000000000 0.0021645022
## PPAT            1  0.042745150 2.576561e-03 0.0000000000 0.0107758621
## P2RX2           1  0.042958560 2.177765e-03 0.0000000000 0.0066371681
## RAX2            2  0.043039110 1.266702e-02 0.0088495575 0.0214797136
## ARAP1           5  0.043136222 4.299701e-05 0.0020475020 0.0004347826
## S100Z           1  0.043139506           NA 0.0000000000 0.0065502183
## TIMM21          3  0.043252002 4.203315e-04 0.0000000000 0.0036023055
## SH2B2           2  0.043273371 7.375241e-03 0.0271149675 0.0417607223
## IQCE            2  0.043608698 1.167078e-02 0.0040816327 0.0118790497
## GPR133          2  0.043892368 1.662771e-02 0.0103092784 0.0231277533
## ACOX3           3  0.044599960 4.725005e-05 0.0000000000 0.0021645022
## DCLRE1B         2  0.044863978 3.680259e-05 0.0000000000 0.0021645022
## CACNA1H         5  0.044938450 1.301784e-03 0.0000000000 0.0031418312
## UNC5A           2  0.045014013 1.861339e-03 0.0000000000 0.0034642032
## EIF4ENIF1       3  0.045114687           NA 0.0000000000 0.0021520803
## CC2D1B          4  0.047255323 0.000000e+00 0.0000000000 0.0022446689
## APLP2           3  0.047439268 1.472890e-04 0.0000000000 0.0021645022
## ALG6            4  0.047926587 1.449723e-02 0.0098855359 0.0168478261
## FREM2           2  0.048533489 3.964961e-04 0.0000000000 0.0032467532
## OR4X2           3  0.048881780 3.681207e-05 0.0000000000 0.0022123894
## OR5D18          2  0.049182573 3.350022e-02 0.0224489796 0.0378787879
## MYO9B           4  0.049238885 8.759907e-04 0.0010351967 0.0033039648
## BIN3            2  0.049250431 3.702744e-04 0.0000000000 0.0032822757
## EGFLAM          7  0.049359636 6.470041e-04 0.0003021148 0.0022194039
##           crude_trend_test_p
## FAM171B         2.151509e-18
## EPHB2           3.220092e-01
## FOXM1           3.023015e-08
## TMEM134         7.480053e-01
## IQCA1           4.919816e-03
## GALNT12         4.486884e-01
## TIMELESS        0.000000e+00
## DBI             1.775266e-01
## TMEM139                   NA
## CCDC27          2.848629e-11
## MYO1D           4.295755e-02
## ERCC6           8.627454e-03
## TROAP           2.090565e-03
## ECT2L           5.266030e-04
## ECM2            2.574508e-01
## DUOX1           6.170625e-01
## OR1S2                     NA
## SH3RF2                    NA
## DPP4            1.435035e-01
## MDM1            7.335189e-05
## ECM1            7.964537e-01
## REV3L           7.679104e-22
## PDE5A                     NA
## LRRIQ3          5.237513e-31
## SLC4A3          8.998370e-08
## NOC3L           8.458125e-01
## SLC4A4          7.893069e-02
## RP9             6.265937e-01
## MEF2A           3.068141e-01
## SLC28A2         2.843605e-07
## GBGT1           9.492073e-01
## C8A             7.882713e-01
## C6orf201                  NA
## TMCO4           9.543098e-01
## ITIH6           7.863809e-05
## ADAMTSL4       7.219679e-109
## RNF135                    NA
## SCN5A           6.236184e-19
## RDX             4.536315e-07
## SERPIND1        8.375230e-03
## HOXD13          1.128662e-01
## METTL20         5.545568e-01
## SLC9A3R1        5.295468e-01
## ENTPD3          7.947945e-02
## LRIT3           2.364205e-02
## MRPL51          4.104889e-14
## SCRN3                     NA
## CENPJ           4.273692e-01
## DEPTOR          1.276417e-11
## C7orf31         3.861723e-98
## WDR63                     NA
## KRT12           7.902473e-01
## IGKC            7.949823e-01
## ITGA9           1.425096e-23
## RALGAPB         3.905950e-01
## C19orf59        8.627859e-01
## ZNF44           7.290773e-01
## DXO             5.890828e-01
## TCF24           3.585680e-01
## GYG1            2.583234e-03
## PLEKHG3         7.809027e-03
## STK11IP         1.673924e-01
## FARS2           6.613277e-01
## OR52K1          2.644906e-02
## NRG1            7.777956e-26
## GALT            6.895987e-01
## CYP2A13         9.656618e-03
## SVOPL           0.000000e+00
## GPR87           4.326746e-11
## NF1             3.295114e-11
## PPAT            1.060213e-02
## P2RX2           1.784944e-01
## RAX2            1.218281e-01
## ARAP1           6.051181e-12
## S100Z                     NA
## TIMM21          3.560414e-06
## SH2B2           2.272441e-38
## IQCE            3.451536e-01
## GPR133          5.092634e-01
## ACOX3           1.127528e-15
## DCLRE1B         2.094227e-11
## CACNA1H         1.811436e-01
## UNC5A           6.958668e-01
## EIF4ENIF1                 NA
## CC2D1B          9.907819e-09
## APLP2           1.041983e-06
## ALG6            9.931371e-01
## FREM2           4.899878e-04
## OR4X2           9.299060e-14
## OR5D18          8.334446e-01
## MYO9B           1.658507e-03
## BIN3            3.608443e-04
## EGFLAM          6.818402e-03

explore_and_save_selected_genes

# Explore selected genes
dim(selected_genes.df) # 93 x 18
## [1] 93 18
summary(selected_genes.df$inverted)
##    Mode   FALSE    TRUE    NA's 
## logical      90       3       0
summary(selected_genes.df$multiallelic)
##    Mode   FALSE    TRUE    NA's 
## logical      90       3       0
hist(selected_genes.df$num_var, 
     labels=TRUE, xlim=c(0,12), xlab="num of variants", ylim=c(0,60), ylab="genes count", 
     main="Histogram of numbers of variants in selected genes")

hist(selected_genes.df$aggregated_MAC,
     labels=TRUE, xlab="aggregated MAC", ylim=c(0,100), ylab="genes count", 
     main="Histogram of aggregated MAC in selected genes")

hist(selected_genes.df$aggregated_MAC[selected_genes.df$aggregated_MAC < 50],
     labels=TRUE, breaks = 46, 
     xlim=c(0,50), xlab="aggregated MAC", ylim=c(0,25), ylab="genes count", 
     main="Histogram of aggregated MAC in selected genes (zoom < 50)")

hist(selected_genes.df$af_exac_NFE, 
     labels=TRUE, xlim=c(0,0.05), xlab="aggregated exac AF", ylim=c(0,80), ylab="genes count", 
     main="Histogram of aggregated exac AF in selected genes")

# Write result table to text file
results_file <- paste(base_folder, "results", "r10c_SKAT_M_and_crude_trends.txt", sep="/")
write.table(selected_genes.df, file=results_file, quote=FALSE, sep="\t")

# Clean-up
rm(results_file)

plot_genes_with_exac_data

genes <- rownames(selected_genes_with_exac.df) 
length(genes) # 83
## [1] 83
for(gene in genes){
  
  #gene <- "FOXM1"
  
  x <- as.numeric(selected_genes_with_exac.df[gene, c("af_exac_NFE", "af_ubc", "af_cbc"), drop=TRUE])
  
  exac_counts <- (selected_genes_with_exac.df[gene, c("ac_exac_NFE", "an_exac_NFE"), drop=TRUE])
  exac_counts <- paste(exac_counts, collapse = "/")
  exac_counts <- paste(round(x[1],5), " (", exac_counts, ")", sep="")
  
  ubc_counts <- (selected_genes_with_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
  ubc_counts <- paste(ubc_counts, collapse = "/")
  ubc_counts <- paste(round(x[2],5), " (", ubc_counts, ")", sep="")

  cbc_counts <- (selected_genes_with_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
  cbc_counts <- paste(cbc_counts, collapse = "/")
  cbc_counts <- paste(round(x[3],5), " (", cbc_counts, ")", sep="")
  
  counts <- c(exac_counts, ubc_counts, cbc_counts)
  
  p <- barplot(x, names=c("af_exac_NFE", "af_ubc", "af_cbc"), 
          ylim=c(0, 1.1*max(x)), ylab="aggregated AF", 
          main=paste(gene,"\ncrude counts"))

  text(p, x, labels=counts, pos=3, offset=.5)  
  
}

rm(gene, genes, x, p, exac_counts, ubc_counts, cbc_counts, counts, selected_genes_with_exac.df)

plot_genes_with_no_exac_data

genes <- rownames(selected_genes_no_exac.df) 
length(genes) # 10
## [1] 10
for(gene in genes){
  
  #gene <- "TMEM139"
  
  x <- as.numeric(selected_genes_no_exac.df[gene, c("af_ubc", "af_cbc"), drop=TRUE])
  
  ubc_counts <- (selected_genes_no_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
  ubc_counts <- paste(ubc_counts, collapse = "/")
  ubc_counts <- paste(round(x[1],5), " (", ubc_counts, ")", sep="")

  cbc_counts <- (selected_genes_no_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
  cbc_counts <- paste(cbc_counts, collapse = "/")
  cbc_counts <- paste(round(x[2],5), " (", cbc_counts, ")", sep="")
  
  counts <- c(ubc_counts, cbc_counts)
  
  p <- barplot(x, names=c("af_ubc", "af_cbc"), 
          ylim=c(0, 1.1*max(x)), ylab="aggregated AF", 
          main=paste(gene,"\ncrude counts"))

  text(p, x, labels=counts, pos=3, offset=.5)  
  
}

rm(gene, genes, x, p, ubc_counts, cbc_counts, counts, selected_genes_no_exac.df)

explore_preselected_candidate_genes

Assuming that candidates have exac data

candidates <- c("FOXM1", "SLC9A3R1", "ACACA", 
                "ATM", "CHEK2", "FANCB", 
                "EPHB2", "TIMELESS", "ERCC6", "REV3L", "PDK4", "HDAC6", 
                "TLR5", "IGKC", "THBS4", "EID3", "AKAP13", "NRG1", "PLK3", "INCENP", "NF1", "CHRNA9")

genes_aggr_skat_M_exac.df[candidates, c("af_exac_NFE", "af_ubc", "af_cbc", "trend_call", "burden_svt_p")]
##           af_exac_NFE       af_ubc      af_cbc trend_call burden_svt_p
## FOXM1    3.119452e-03 0.0046001415 0.009238729       risk  0.005854754
## SLC9A3R1 1.773017e-03 0.0004201681 0.003094607       risk  0.029060352
## ACACA    1.840130e-05 0.0005122951 0.001628664       risk  0.231834803
## ATM      6.219315e-04 0.0014744913 0.002162496       risk  0.265714946
## CHEK2    4.028686e-04 0.0014611338 0.002770936       risk  0.135053181
## FANCB    8.716822e-02 0.0857740586 0.111842105       risk  0.193727742
## EPHB2    3.075506e-03 0.0000000000 0.006802721       risk  0.004438192
## TIMELESS 2.624044e-04 0.0700541191 0.064362714       risk  0.010587495
## ERCC6    7.560670e-04 0.0003465003 0.002539913       risk  0.013369314
## REV3L    6.782671e-05 0.0000000000 0.002207506       risk  0.018944101
## PDK4     6.978547e-03 0.0126582278 0.000000000 protective  0.020845472
## HDAC6    3.441264e-03 0.0081632653 0.000000000 protective  0.077788594
## TLR5     7.060521e-02 0.0761904762 0.059798271 protective  0.033865983
## IGKC     1.238896e-03 0.0000000000 0.002164502       risk  0.034368191
## THBS4    1.624306e-03 0.0051546392 0.000000000 protective  0.042211267
## EID3     1.880256e-03 0.0051020408 0.000000000 protective  0.042893414
## AKAP13   2.515029e-04 0.0025614754 0.000000000 protective  0.043316528
## NRG1     1.841688e-05 0.0000000000 0.003432494       risk  0.041964650
## PLK3     1.478524e-04 0.0000000000 0.002380952       risk  0.050357879
## INCENP   3.171559e-03 0.0061728395 0.000000000 protective  0.040107295
## NF1      5.520389e-05 0.0000000000 0.002164502       risk  0.042363028
## CHRNA9   6.198220e-04 0.0035014006 0.000000000 protective  0.044378320
for(gene in candidates){
  
  #gene <- "FOXM1"
  #gene <- "TMEM139" # no exac data
  
  # Skip gene if it does not have exac data
  if(is.na(genes_aggr_skat_M_exac.df[gene,"af_exac_NFE"])) {
    print(paste(gene, "has no exac data"))
    next
  }
  
  x <- as.numeric(genes_aggr_skat_M_exac.df[gene, c("af_exac_NFE", "af_ubc", "af_cbc"), drop=TRUE])
  
  exac_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_exac_NFE", "an_exac_NFE"), drop=TRUE])
  exac_counts <- paste(exac_counts, collapse = "/")
  exac_counts <- paste(round(x[1],5), " (", exac_counts, ")", sep="")
  
  ubc_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_ubc", "an_ubc"), drop=TRUE])
  ubc_counts <- paste(ubc_counts, collapse = "/")
  ubc_counts <- paste(round(x[2],5), " (", ubc_counts, ")", sep="")

  cbc_counts <- (genes_aggr_skat_M_exac.df[gene, c("ac_cbc", "an_cbc"), drop=TRUE])
  cbc_counts <- paste(cbc_counts, collapse = "/")
  cbc_counts <- paste(round(x[3],5), " (", cbc_counts, ")", sep="")
  
  counts <- c(exac_counts, ubc_counts, cbc_counts)
  
  p <- barplot(x, names=c("af_exac_NFE", "af_ubc", "af_cbc"), 
          ylim=c(0, 1.1*max(x)), ylab="aggregated AF", 
          main=paste(gene,"\ncrude counts"))

  text(p, x, labels=counts, pos=3, offset=.5)  
  
}

rm(gene, candidates, x, p, exac_counts, ubc_counts, cbc_counts, counts)

data_summary

dim(genotypes.mx)
## [1] 18551   478
class(genotypes.mx)
## [1] "matrix"
genotypes.mx[1:5,1:5]
##              P1_A01 P1_A02 P1_A03 P1_A04 P1_A05
## Var000000026      0     NA      0      0      0
## Var000000029      0      0      0      0      0
## Var000000031      0      0      0      0      0
## Var000000053      0      0      0      0     NA
## Var000000064      0      0      0      0     NA
dim(genotypes_inv_imp_wt.mx)
## [1] 18551   478
class(genotypes_inv_imp_wt.mx)
## [1] "matrix"
genotypes_inv_imp_wt.mx[1:5,1:5]
##              P1_A01     P1_A02 P1_A03 P1_A04     P1_A05
## Var000000026      0 0.05482456      0      0 0.00000000
## Var000000029      0 0.00000000      0      0 0.00000000
## Var000000031      0 0.00000000      0      0 0.00000000
## Var000000053      0 0.00000000      0      0 0.05707763
## Var000000064      0 0.00000000      0      0 0.05518764
dim(genes_aggr_data.mx)
## [1] 9109  478
class(genes_aggr_data.mx)
## [1] "matrix"
genes_aggr_data.mx[1:5,1:5]
##        P1_A01     P1_A02     P1_A03   P1_A04    P1_A05
## NOC2L       0 0.05482456 0.00000000 0.000000 0.0000000
## KLHL17      0 0.00000000 0.00000000 0.000000 0.1122653
## AGRN        0 0.65431621 0.00000000 8.602334 0.0000000
## TTLL10      0 0.00000000 0.00000000 0.000000 0.0000000
## PUSL1       0 0.05821459 0.05821459 0.000000 0.0000000
dim(genes_aggr_skat_M.df)
## [1] 9109   12
str(genes_aggr_skat_M.df)
## 'data.frame':    9109 obs. of  12 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : int  3 2 3 1 1 1 1 1 2 1 ...
##  $ svt_p             : num  NA NA NA 0.346 0.711 ...
##  $ svt_is_accurate   : logi  NA NA NA TRUE TRUE TRUE ...
##  $ svt_map           : num  NA NA NA 0.0912 0.2106 ...
##  $ burden_p          : num  0.533 0.343 0.431 NA NA ...
##  $ burden_is_accurate: logi  TRUE TRUE TRUE NA NA NA ...
##  $ burden_map        : num  0.0135 0.0871 -1 NA NA ...
##  $ skat_p            : num  0.465 0.831 0.344 NA NA ...
##  $ skat_is_accurate  : logi  TRUE TRUE TRUE NA NA NA ...
##  $ skat_map          : num  0.0118 0.0871 -1 NA NA ...
##  $ aggregated_MAC    : num  4 2 36 2 1 1 1 41 3 1 ...
genes_aggr_skat_M.df[1:5,1:5]
##          gene num_var     svt_p svt_is_accurate    svt_map
## NOC2L   NOC2L       3        NA              NA         NA
## KLHL17 KLHL17       2        NA              NA         NA
## AGRN     AGRN       3        NA              NA         NA
## TTLL10 TTLL10       1 0.3459301            TRUE 0.09120194
## PUSL1   PUSL1       1 0.7106340            TRUE 0.21063399
dim(genes_aggr_skat_M_exac.df)
## [1] 9109   18
str(genes_aggr_skat_M_exac.df)
## 'data.frame':    9109 obs. of  18 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : int  3 2 3 1 1 1 1 1 2 1 ...
##  $ aggregated_MAC    : num  4 2 36 2 1 1 1 41 3 1 ...
##  $ burden_svt_p      : num  0.533 0.343 0.431 0.346 0.711 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_exac_NFE       : num  14 2 2193 1 32 ...
##  $ an_exac_NFE       : num  108598 108618 74336 4760 54314 ...
##  $ af_exac_NFE       : num  1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
##  $ ac_ubc            : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ an_ubc            : num  1450 912 1432 490 410 ...
##  $ af_ubc            : num  0.00069 0 0.01466 0 0.00244 ...
##  $ ac_cbc            : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ an_cbc            : num  1358 870 1360 466 400 ...
##  $ af_cbc            : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ pearson_r         : num  0.966 0.863 -0.944 0.844 -0.231 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
##  $ crude_trend_test_p: num  1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_skat_M_exac.df[1:5,1:5]
##          gene num_var aggregated_MAC burden_svt_p inverted
## NOC2L   NOC2L       3              4    0.5333523    FALSE
## KLHL17 KLHL17       2              2    0.3433343    FALSE
## AGRN     AGRN       3             36    0.4308774    FALSE
## TTLL10 TTLL10       1              2    0.3459301    FALSE
## PUSL1   PUSL1       1              1    0.7106340    FALSE
dim(selected_genes.df)
## [1] 93 18
str(selected_genes.df)
## 'data.frame':    93 obs. of  18 variables:
##  $ gene              : chr  "FAM171B" "EPHB2" "FOXM1" "TMEM134" ...
##  $ num_var           : int  4 2 6 1 2 5 7 3 1 4 ...
##  $ aggregated_MAC    : num  19 6 38 5 5 12 436 5 5 5 ...
##  $ burden_svt_p      : num  0.00137 0.00444 0.00585 0.00759 0.00847 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_exac_NFE       : num  31 334 492 357 118 686 57 224 NA 17 ...
##  $ an_exac_NFE       : num  104836 108600 157720 54034 99566 ...
##  $ af_exac_NFE       : num  0.000296 0.003076 0.003119 0.006607 0.001185 ...
##  $ ac_ubc            : num  18 0 13 0 0 2 233 0 5 0 ...
##  $ an_ubc            : num  1856 950 2826 470 974 ...
##  $ af_ubc            : num  0.0097 0 0.0046 0 0 ...
##  $ ac_cbc            : num  1 6 25 5 5 10 203 5 0 5 ...
##  $ an_cbc            : num  1776 882 2706 432 914 ...
##  $ af_cbc            : num  0.000563 0.006803 0.009239 0.011574 0.00547 ...
##  $ pearson_r         : num  0.025 0.547 0.958 0.428 0.745 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 2 2 2 2 2 2 2 NA 2 ...
##  $ crude_trend_test_p: num  2.15e-18 3.22e-01 3.02e-08 7.48e-01 4.92e-03 ...
selected_genes.df[1:5,1:5]
##            gene num_var aggregated_MAC burden_svt_p inverted
## FAM171B FAM171B       4             19  0.001368650    FALSE
## EPHB2     EPHB2       2              6  0.004438192    FALSE
## FOXM1     FOXM1       6             38  0.005854754    FALSE
## TMEM134 TMEM134       1              5  0.007589152    FALSE
## IQCA1     IQCA1       2              5  0.008472387    FALSE
dim(genes_aggr_info.df)
## [1] 9109   27
str(genes_aggr_info.df)
## 'data.frame':    9109 obs. of  27 variables:
##  $ gene             : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var          : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ aggr_ac          : num  4 2 36 2 1 1 1 41 3 1 ...
##  $ aggr_an          : num  2808 1782 2792 956 810 ...
##  $ aggr_af          : num  0.00142 0.00112 0.01289 0.00209 0.00123 ...
##  $ aggr_ac_cbc      : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ aggr_an_cbc      : num  1358 870 1360 466 400 ...
##  $ aggr_af_cbc      : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ aggr_ac_ubc      : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ aggr_an_ubc      : num  1450 912 1432 490 410 ...
##  $ aggr_af_ubc      : num  0.00069 0 0.01466 0 0.00244 ...
##  $ aggr_ac_cbc_fam  : num  1 0 5 2 0 0 1 9 0 0 ...
##  $ aggr_an_cbc_fam  : num  482 306 478 166 144 162 164 164 318 150 ...
##  $ aggr_af_cbc_fam  : num  0.00207 0 0.01046 0.01205 0 ...
##  $ aggr_ac_cbc_nofam: num  2 2 10 0 0 1 0 11 2 0 ...
##  $ aggr_an_cbc_nofam: num  876 564 882 300 256 298 296 300 566 290 ...
##  $ aggr_af_cbc_nofam: num  0.00228 0.00355 0.01134 0 0 ...
##  $ aggr_ac_ubc_fam  : num  0 0 4 0 1 0 0 8 0 1 ...
##  $ aggr_an_ubc_fam  : num  418 262 414 142 120 140 142 142 278 132 ...
##  $ aggr_af_ubc_fam  : num  0 0 0.00966 0 0.00833 ...
##  $ aggr_ac_ubc_nofam: num  1 0 17 0 0 0 0 13 1 0 ...
##  $ aggr_an_ubc_nofam: num  1032 650 1018 348 290 ...
##  $ aggr_af_ubc_nofam: num  0.000969 0 0.016699 0 0 ...
##  $ cbc_ubc_call     : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
##  $ cbc_ubc_fisher_p : num  0.359 0.238 0.408 0.237 1 ...
genes_aggr_info.df[1:5,1:5]
##          gene num_var inverted multiallelic aggr_ac
## NOC2L   NOC2L       3    FALSE        FALSE       4
## KLHL17 KLHL17       2    FALSE        FALSE       2
## AGRN     AGRN       3    FALSE        FALSE      36
## TTLL10 TTLL10       1    FALSE        FALSE       2
## PUSL1   PUSL1       1    FALSE        FALSE       1
dim(genes_aggr_kgen.df)
## [1] 9109   16
str(genes_aggr_kgen.df)
## 'data.frame':    9109 obs. of  16 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_kgen           : num  2 NA 92 NA 6 NA 4 371 9 NA ...
##  $ an_kgen           : num  10016 NA 5008 NA 5008 ...
##  $ af_kgen           : num  0.0002 NA 0.0184 NA 0.0012 ...
##  $ ac_ubc            : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ an_ubc            : num  1450 912 1432 490 410 ...
##  $ af_ubc            : num  0.00069 0 0.01466 0 0.00244 ...
##  $ ac_cbc            : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ an_cbc            : num  1358 870 1360 466 400 ...
##  $ af_cbc            : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ pearson_r         : num  0.959 NA -1 NA -0.491 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 NA 1 NA 1 NA 2 1 2 NA ...
##  $ crude_trend_test_p: num  0.00168 NA 0.04823 NA 0.75384 ...
genes_aggr_kgen.df[1:5,1:5]
##          gene num_var inverted multiallelic ac_kgen
## NOC2L   NOC2L       3    FALSE        FALSE       2
## KLHL17 KLHL17       2    FALSE        FALSE      NA
## AGRN     AGRN       3    FALSE        FALSE      92
## TTLL10 TTLL10       1    FALSE        FALSE      NA
## PUSL1   PUSL1       1    FALSE        FALSE       6
dim(genes_aggr_exac.df)
## [1] 9109   16
str(genes_aggr_exac.df)
## 'data.frame':    9109 obs. of  16 variables:
##  $ gene              : chr  "NOC2L" "KLHL17" "AGRN" "TTLL10" ...
##  $ num_var           : num  3 2 3 1 1 1 1 1 2 1 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_exac_NFE       : num  14 2 2193 1 32 ...
##  $ an_exac_NFE       : num  108598 108618 74336 4760 54314 ...
##  $ af_exac_NFE       : num  1.29e-04 1.84e-05 2.95e-02 2.10e-04 5.89e-04 ...
##  $ ac_ubc            : num  1 0 21 0 1 0 0 21 1 1 ...
##  $ an_ubc            : num  1450 912 1432 490 410 ...
##  $ af_ubc            : num  0.00069 0 0.01466 0 0.00244 ...
##  $ ac_cbc            : num  3 2 15 2 0 1 1 20 2 0 ...
##  $ an_cbc            : num  1358 870 1360 466 400 ...
##  $ af_cbc            : num  0.00221 0.0023 0.01103 0.00429 0 ...
##  $ pearson_r         : num  0.966 0.863 -0.944 0.844 -0.231 ...
##  $ trend_call        : Factor w/ 3 levels "protective","risk",..: 2 2 1 2 1 2 2 2 2 1 ...
##  $ crude_trend_test_p: num  1.28e-09 6.31e-23 4.40e-07 1.49e-03 8.00e-01 ...
genes_aggr_exac.df[1:5,1:5]
##          gene num_var inverted multiallelic ac_exac_NFE
## NOC2L   NOC2L       3    FALSE        FALSE          14
## KLHL17 KLHL17       2    FALSE        FALSE           2
## AGRN     AGRN       3    FALSE        FALSE        2193
## TTLL10 TTLL10       1    FALSE        FALSE           1
## PUSL1   PUSL1       1    FALSE        FALSE          32
dim(kgen.df)
## [1] 18551     9
colnames(kgen.df)
## [1] "SplitVarID"  "kgen.AC"     "kgen.AN"     "kgen.AF"     "kgen.AFR_AF"
## [6] "kgen.AMR_AF" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF"
kgen.df[1:5,1:5]
##                SplitVarID kgen.AC kgen.AN     kgen.AF kgen.AFR_AF
## Var000000026 Var000000026      NA      NA          NA          NA
## Var000000029 Var000000029       1    5008 0.000199681           0
## Var000000031 Var000000031       1    5008 0.000199681           0
## Var000000053 Var000000053      NA      NA          NA          NA
## Var000000064 Var000000064      NA      NA          NA          NA
dim(exac.df)
## [1] 18551    48
colnames(exac.df)
##  [1] "SplitVarID"              "exac_non_TCGA.AF"       
##  [3] "exac_non_TCGA.AC"        "exac_non_TCGA.AN"       
##  [5] "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AN_FEMALE"
##  [7] "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AN_MALE"  
##  [9] "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AN_Adj"   
## [11] "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_Het"   
## [13] "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_AFR"   
## [15] "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.Hom_AFR"  
## [17] "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Hemi_AFR" 
## [19] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AN_AMR"   
## [21] "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Het_AMR"  
## [23] "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.AC_EAS"   
## [25] "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.Hom_EAS"  
## [27] "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Hemi_EAS" 
## [29] "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AN_FIN"   
## [31] "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Het_FIN"  
## [33] "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.AC_NFE"   
## [35] "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.Hom_NFE"  
## [37] "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Hemi_NFE" 
## [39] "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AN_SAS"   
## [41] "exac_non_TCGA.Hom_SAS"   "exac_non_TCGA.Het_SAS"  
## [43] "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.AC_OTH"   
## [45] "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.Hom_OTH"  
## [47] "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Hemi_OTH"
exac.df[1:5,1:5]
##                SplitVarID exac_non_TCGA.AF exac_non_TCGA.AC
## Var000000026 Var000000026               NA               NA
## Var000000029 Var000000029        1.506e-04               16
## Var000000031 Var000000031        2.354e-04               25
## Var000000053 Var000000053        1.883e-05                2
## Var000000064 Var000000064        9.416e-06                1
##              exac_non_TCGA.AN exac_non_TCGA.AC_FEMALE
## Var000000026               NA                      NA
## Var000000029           106210                       8
## Var000000031           106210                       4
## Var000000053           106210                       0
## Var000000064           106206                       0
dim(variants.df)
## [1] 18551    67
str(variants.df)
## 'data.frame':    18551 obs. of  67 variables:
##  $ SplitVarID        : Factor w/ 343824 levels "Var000000001",..: 26 29 31 53 64 167 227 237 300 400 ...
##  $ SYMBOL            : Factor w/ 19862 levels "A1BG","A1CF",..: 11149 11149 11149 8665 8665 540 540 540 18233 13545 ...
##  $ TYPE              : Factor w/ 2 levels "INDEL","SNP": 2 2 2 2 2 2 2 2 2 2 ...
##  $ CHROM             : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ POS               : int  881871 883883 883946 897287 897792 979748 987173 989899 1132513 1246373 ...
##  $ REF               : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1044 1575 539 539 539 1 1575 539 539 1044 ...
##  $ ALT               : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 1 323 989 989 989 989 664 989 989 1 ...
##  $ ac_raw            : int  1 2 1 1 1 53 1 1 2 1 ...
##  $ af_raw            : num  0.000711 0.001555 0.000774 0.000711 0.000722 ...
##  $ an_raw            : int  1406 1286 1292 1406 1386 1416 1420 1414 1320 1400 ...
##  $ Consequence       : Factor w/ 78 levels "3_prime_UTR_variant",..: 66 28 28 28 66 28 28 28 28 28 ...
##  $ SIFT_call         : Factor w/ 4 levels "deleterious",..: NA 1 1 1 NA 1 1 1 1 1 ...
##  $ SIFT_score        : num  NA 0 0.01 0.04 NA 0.01 0 0.02 0.02 0.05 ...
##  $ PolyPhen_call     : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 3 NA 3 3 3 3 3 ...
##  $ PolyPhen_score    : num  NA 0.936 0.946 0.997 NA 0.932 0.999 0.991 0.999 0.988 ...
##  $ CLIN_SIG          : Factor w/ 120 levels "association",..: NA NA NA NA NA 2 NA NA NA NA ...
##  $ cDNA_position     : Factor w/ 16585 levels "1","?-1","10",..: 4302 3741 3476 13013 15367 5539 11506 11969 4210 15212 ...
##  $ CDS_position      : Factor w/ 15333 levels "1","?-1","10",..: 3744 3218 2977 10599 13061 4871 10484 10919 3279 13242 ...
##  $ Codons            : Factor w/ 3014 levels "-/A","-/AA","AAA/-",..: 1025 1858 1306 1172 1300 1864 2503 374 328 1877 ...
##  $ Protein_position  : Factor w/ 7360 levels "1","?-1","10",..: 6008 5707 5569 1723 2739 6606 1672 1845 5745 2811 ...
##  $ Amino_acids       : Factor w/ 1851 levels "-/*","*","A",..: 1171 334 1303 1134 1283 381 1809 1594 1578 225 ...
##  $ Existing_variation: Factor w/ 309385 levels "","FANCA:c.2534T>C",..: NA 54731 225315 291348 274420 10329 268628 74468 292466 49490 ...
##  $ Multiallelic      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_all            : int  1 2 1 1 1 34 1 1 2 1 ...
##  $ an_all            : num  912 942 954 876 906 894 954 944 956 810 ...
##  $ af_all            : num  0.0011 0.00212 0.00105 0.00114 0.0011 ...
##  $ ac_ubc            : int  0 0 1 0 0 20 0 1 0 1 ...
##  $ an_ubc            : num  472 488 490 450 462 460 488 484 490 410 ...
##  $ af_ubc            : num  0 0 0.00204 0 0 ...
##  $ ac_cbc            : int  1 2 0 1 1 14 1 0 2 0 ...
##  $ an_cbc            : num  440 454 464 426 444 434 466 460 466 400 ...
##  $ af_cbc            : num  0.00227 0.00441 0 0.00235 0.00225 ...
##  $ ac_ubc_fam        : int  0 0 0 0 0 4 0 0 0 1 ...
##  $ an_ubc_fam        : num  134 142 142 130 132 132 142 140 142 120 ...
##  $ af_ubc_fam        : num  0 0 0 0 0 ...
##  $ ac_ubc_nofam      : int  0 0 1 0 0 16 0 1 0 0 ...
##  $ an_ubc_nofam      : num  338 346 348 320 330 328 346 344 348 290 ...
##  $ af_ubc_nofam      : num  0 0 0.00287 0 0 ...
##  $ ac_cbc_fam        : int  0 1 0 0 0 5 0 0 2 0 ...
##  $ an_cbc_fam        : num  156 162 164 152 154 148 166 164 166 144 ...
##  $ af_cbc_fam        : num  0 0.00617 0 0 0 ...
##  $ ac_cbc_nofam      : int  1 1 0 1 1 9 1 0 0 0 ...
##  $ an_cbc_nofam      : num  284 292 300 274 290 286 300 296 300 256 ...
##  $ af_cbc_nofam      : num  0.00352 0.00342 0 0.00365 0.00345 ...
##  $ inverted          : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ac_inv            : num  1 2 1 1 1 34 1 1 2 1 ...
##  $ an_inv            : num  912 942 954 876 906 894 954 944 956 810 ...
##  $ af_inv            : num  0.0011 0.00212 0.00105 0.00114 0.0011 ...
##  $ ac_cbc_inv        : num  1 2 0 1 1 14 1 0 2 0 ...
##  $ an_cbc_inv        : num  440 454 464 426 444 434 466 460 466 400 ...
##  $ af_cbc_inv        : num  0.00227 0.00441 0 0.00235 0.00225 ...
##  $ ac_ubc_inv        : num  0 0 1 0 0 20 0 1 0 1 ...
##  $ an_ubc_inv        : num  472 488 490 450 462 460 488 484 490 410 ...
##  $ af_ubc_inv        : num  0 0 0.00204 0 0 ...
##  $ ac_cbc_fam_inv    : num  0 1 0 0 0 5 0 0 2 0 ...
##  $ an_cbc_fam_inv    : num  156 162 164 152 154 148 166 164 166 144 ...
##  $ af_cbc_fam_inv    : num  0 0.00617 0 0 0 ...
##  $ ac_cbc_nofam_inv  : num  1 1 0 1 1 9 1 0 0 0 ...
##  $ an_cbc_nofam_inv  : num  284 292 300 274 290 286 300 296 300 256 ...
##  $ af_cbc_nofam_inv  : num  0.00352 0.00342 0 0.00365 0.00345 ...
##  $ ac_ubc_fam_inv    : num  0 0 0 0 0 4 0 0 0 1 ...
##  $ an_ubc_fam_inv    : num  134 142 142 130 132 132 142 140 142 120 ...
##  $ af_ubc_fam_inv    : num  0 0 0 0 0 ...
##  $ ac_ubc_nofam_inv  : num  0 0 1 0 0 16 0 1 0 0 ...
##  $ an_ubc_nofam_inv  : num  338 346 348 320 330 328 346 344 348 290 ...
##  $ af_ubc_nofam_inv  : num  0 0 0.00287 0 0 ...
##  $ weight            : num  25 25 23.8 25 25 ...
variants.df[1:5,1:5]
##                SplitVarID SYMBOL TYPE CHROM    POS
## Var000000026 Var000000026  NOC2L  SNP     1 881871
## Var000000029 Var000000029  NOC2L  SNP     1 883883
## Var000000031 Var000000031  NOC2L  SNP     1 883946
## Var000000053 Var000000053 KLHL17  SNP     1 897287
## Var000000064 Var000000064 KLHL17  SNP     1 897792
dim(phenotypes.df)
## [1] 478  36
str(phenotypes.df)
## 'data.frame':    478 obs. of  36 variables:
##  $ wes_id        : chr  "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
##  $ gwas_id       : chr  "id203568" "id357807" "id325472" "id304354" ...
##  $ merged_id     : chr  "P1_A01_id203568" "P1_A02_id357807" "P1_A03_id325472" "P1_A04_id304354" ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ setno         : int  203568 357807 325472 304354 222648 244843 276284 297810 250898 226974 ...
##  $ registry      : Factor w/ 5 levels "de","IA","IR",..: 3 4 2 2 5 4 2 4 4 2 ...
##  $ family_history: int  1 0 1 1 1 1 1 1 1 0 ...
##  $ age_dx        : int  49 35 32 33 44 28 28 38 35 36 ...
##  $ age_ref       : num  58 36 41 34 49 28 32 44 35 38 ...
##  $ rstime        : num  10.17 1.83 9.75 1.59 5.66 ...
##  $ eig1_gwas     : num  -0.00389 -0.00379 -0.01011 -0.01288 -0.01086 ...
##  $ eig2_gwas     : num  0.00266 0.00246 -0.0001 0.00595 0.01157 ...
##  $ eig3_gwas     : num  0.06803 0.05055 -0.00603 0.00747 0.00144 ...
##  $ eig4_gwas     : num  -0.03469 -0.01264 -0.01269 -0.01841 0.00663 ...
##  $ eig5_gwas     : num  -0.03845 -0.00424 0.01597 -0.0065 0.01391 ...
##  $ stage         : num  1 2 2 1 1 1 2 1 2 1 ...
##  $ er            : num  NA 0 0 0 NA 1 1 1 1 0 ...
##  $ pr            : num  NA 0 0 NA NA 1 NA 1 0 0 ...
##  $ hist_cat      : chr  "lobular" "ductal" "medullary" "ductal" ...
##  $ hormone       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ chemo_cat     : chr  "no" "CMF" "Oth" "no" ...
##  $ br_xray       : int  1 1 0 0 1 0 0 0 1 1 ...
##  $ br_xray_dose  : num  1.6 0.83 0 0 0.77 0 0 0 1.1 0.83 ...
##  $ age_menarche  : num  9 13 10 12 10 13 12 11 11 NA ...
##  $ age_1st_ftp   : num  30 0 26 0 17 0 25 28 27 18 ...
##  $ age_menopause : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ num_preg      : num  1 0 1 0 1 0 1 1 1 1 ...
##  $ bmi_age18     : num  20.8 22.5 23.6 18.6 19.9 ...
##  $ bmi_dx        : num  25.8 22.9 28.3 17.8 26.6 ...
##  $ bmi_ref       : num  33.3 22.9 33.1 17.8 29.8 ...
##  $ eig1          : num  -0.1566 -0.13939 -0.00225 -0.00914 0.01841 ...
##  $ eig2          : num  0.08741 0.06847 -0.00239 0.0008 -0.02139 ...
##  $ eig3          : num  -0.0185 -0.0117 -0.0185 -0.0713 -0.0101 ...
##  $ eig4          : num  -0.03333 0.03332 0.01482 0.00719 -0.01141 ...
##  $ eig5          : num  0.02116 0.09035 0.00663 0.01628 -0.02406 ...
phenotypes.df[1:5,1:5]
##        wes_id  gwas_id       merged_id filter cc
## P1_A01 P1_A01 id203568 P1_A01_id203568   pass  1
## P1_A02 P1_A02 id357807 P1_A02_id357807   pass  1
## P1_A03 P1_A03 id325472 P1_A03_id325472   pass  1
## P1_A04 P1_A04 id304354 P1_A04_id304354   pass  1
## P1_A05 P1_A05 id222648 P1_A05_id222648   pass  1
# Check consistency of rownames and colnames
# Note that, in contrast to the source data, genes_aggr tables have already been syncronised
sum(colnames(genotypes.mx) != rownames(phenotypes.df))
## [1] 0
sum(colnames(genes_aggr_data.mx) != rownames(phenotypes.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_data.mx))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_kgen.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_exac.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_skat_M.df))
## [1] 0
sum(rownames(genes_aggr_info.df) != rownames(genes_aggr_skat_M_exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(genotypes_inv_imp_wt.mx))
## [1] 0
sum(rownames(genotypes.mx) != rownames(kgen.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(exac.df))
## [1] 0
sum(rownames(genotypes.mx) != rownames(variants.df))
## [1] 0

save_data

# Save data
save.image(paste(base_folder, "results", "r10c_SKAT_M_and_crude_trends.RData", sep="/"))

final_section

ls()
##  [1] "base_folder"               "exac.df"                  
##  [3] "genes_aggr_data.mx"        "genes_aggr_exac.df"       
##  [5] "genes_aggr_info.df"        "genes_aggr_kgen.df"       
##  [7] "genes_aggr_skat_M.df"      "genes_aggr_skat_M_exac.df"
##  [9] "genotypes_inv_imp_wt.mx"   "genotypes.mx"             
## [11] "kgen.df"                   "phenotypes.df"            
## [13] "selected_genes.df"         "variants.df"
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.15.1
## 
## loaded via a namespace (and not attached):
##  [1] backports_1.0.5 magrittr_1.5    rprojroot_1.2   tools_3.3.3    
##  [5] htmltools_0.3.5 yaml_2.1.14     Rcpp_0.12.9     stringi_1.1.2  
##  [9] rmarkdown_1.3   stringr_1.2.0   digest_0.6.12   evaluate_0.10
Sys.time()
## [1] "2017-05-05 22:11:09 BST"